import ma.const

import re
import os
import os.path
from urllib.request import urlopen
from urllib.request import urlretrieve

# Local root directory where all downloaded books are saved (one subdir per category).
main_copy_dir = "C:\\gutenberg\\"
# Base URL prepended to relative links scraped from the pages.
base_url = "http://www.gutenberg.org"
# Index page listing every "Bookshelf" category on the Gutenberg wiki.
catg_gb_url = "http://www.gutenberg.org/wiki/Category:Bookshelf"
# Maps category name -> absolute URL of its category page (populated below).
catg_links = {}
# Job identifier passed to ma.const.JobsXmlData.get_str_data when building filenames.
job_id = 5
# Links containing any of these substrings are meta/navigation links, not categories.
exclude_link_list = ["Gutenberg:", "Category:", "Special:", "http://www.ibiblio.org"]

# Fetch the bookshelf index page and collect a link for every category.
# NOTE: urlopen(...).read() returns bytes in Python 3 — it must be decoded
# before re.findall can apply a str pattern (otherwise TypeError is raised).
main_page = urlopen(catg_gb_url).read().decode('utf-8', errors='replace')
links = re.findall(r'<a\s+href="(\S+)"\s+title="([^"]+)"', main_page)

for href, title in links:
    # Skip meta/navigation links (Gutenberg:, Category:, Special:, external site).
    if any(marker in href for marker in exclude_link_list):
        continue
    catg_name = title.replace('(Bookshelf)', '').strip()
    print(catg_name)
    catg_links[catg_name] = base_url + href

print('Categories extracted:', len(catg_links))
total_counter = 0          # global running book number, used in generated filenames
running = False            # becomes True once the user picks a category to start from
category = 0               # 1-based index of the category being processed

# Walk every category page and download the plain-text edition of each book.
for catg_name in catg_links:
    category += 1
    print('Processing category', category, ':', catg_name)

    # Until the user accepts a category, offer to skip; on the first accepted
    # category ask for the last downloaded book number so the run can resume.
    if not running:
        inp = input('Do you want to skip this category:')
        if inp.lower() == 'y':
            continue
        inp = input('What is the last downloaded book no:')
        total_counter = int(inp) + 1
        running = True

    # Create the directory for this category (no-op if it already exists).
    catg_path = main_copy_dir + catg_name + os.sep
    os.makedirs(catg_path, exist_ok=True)

    current_link = catg_links[catg_name]
    # Decode: read() yields bytes in Python 3; re needs str.
    catg_page = urlopen(current_link).read().decode('utf-8', errors='replace')
    book_links = re.findall(r'<a\s+href="(http://www.gutenberg.org/ebooks/[^"]+)"', catg_page)
    counter = 0
    for book_link in book_links:
        counter += 1

        book_page = urlopen(book_link).read().decode('utf-8', errors='replace')
        # Locate the "Plain text" download row in the book's file table.
        book_download_links = re.findall(r'<td[^>]*>Plain text</td><td[^>]*>[^<]*</td><td[^>]*>none</td><td[^>]*>[^<]*</td><td[^>]*><a\s+href="(\S+)"', book_page)
        if book_download_links:
            book_download_link = base_url + book_download_links[0]
            downloaded = False

            # Target filename comes from the project's job/XML naming scheme.
            filename = catg_path + ma.const.JobsXmlData.get_str_data(ma.const.xml_map_input_filename, job_id, total_counter)
            if os.path.isfile(filename):
                downloaded_filesize = os.path.getsize(filename)
                # Python 3: HTTPResponse.info() returns an email.message.Message;
                # use .get('Content-Length'), not the Python 2 getheader().
                download_size = int(urlopen(book_download_link).info().get('Content-Length'))
                print(downloaded_filesize, download_size)
                # Matching size means the file was fully downloaded earlier.
                if downloaded_filesize == download_size:
                    print('This file has already been downloaded:', book_link)
                    downloaded = True

            if not downloaded:
                (filename, headers) = urlretrieve(book_download_link, filename)

            total_counter += 1
        else:
            print('Warning: Book not downloadable:', book_link)

        print('Books processed in this', category, 'category:', counter)
